28 实践课-智能体记忆管理与多轮对话

智能体记忆管理与多轮对话

关联:索引

术语小抄(初学者版)



课程思政融入点(口径统一):

不要存进记忆的内容(工程红线):

目标:用最小代码跑通“写入对话→读取最近上下文→保存任务状态→下次启动还能读到”。

1)建立练习目录(建议,PowerShell)

mkdir .\practice_memory_dialogue
cd .\practice_memory_dialogue

解释与自检要点:

2)实现 memory_store.py(可直接复制运行)

# memory_store.py
# 目标:用“最小实现”演示智能体的两类记忆
# - 短期:保存最近 N 轮对话(用于承接上下文/指代)
# - 长期:保存 task_state(用于跨轮/跨进程复现)
import json
import os
import time
import uuid
from dataclasses import dataclass, asdict
from typing import Any, Dict, List, Literal, Optional, Tuple

# Restricts the allowed role values so arbitrary strings are not written into the dialogue history.
# NOTE(review): Literal is a static typing hint; nothing visible here enforces it at runtime.
Role = Literal["user", "assistant", "system"]

@dataclass(frozen=True)
class Turn:
    # Minimal structure of one dialogue turn: who spoke, what was said, and a timestamp in milliseconds.
    role: Role
    content: str
    ts_ms: int

def _now_ms() -> int:
    """Return the current wall-clock time as whole milliseconds since the epoch.

    Millisecond timestamps are used everywhere so turns can later be replayed,
    sorted, or expired with a single unit.
    """
    seconds = time.time()
    return int(seconds * 1000)

def _safe_text(x: Any) -> str:
    """Normalize any value to a string with surrounding whitespace removed.

    ``None`` becomes the empty string; every other value goes through ``str()``.
    """
    if x is None:
        return ""
    return str(x).strip()

class MemoryStore:
    """Minimal classroom implementation of two kinds of agent memory.

    - short-term: the most recent N turns per session (context carry-over),
      held in process memory only.
    - long-term: one ``task_state`` dict per session (task status, preferences),
      persisted as a JSON file under ``base_dir`` so another process can read it.
    """

    def __init__(self, *, base_dir: str, short_term_max_turns: int = 12) -> None:
        """Create the store and make sure the long-term directory exists.

        Args:
            base_dir: Directory that receives the ``task_state_*.json`` files.
            short_term_max_turns: Upper bound of turns kept per session;
                ``0`` keeps no history at all.

        Raises:
            ValueError: If ``short_term_max_turns`` is negative.
        """
        self.base_dir = os.path.abspath(base_dir)
        self.short_term_max_turns = int(short_term_max_turns)
        if self.short_term_max_turns < 0:
            raise ValueError("short_term_max_turns must be >= 0")
        # In-memory only, which is enough for the classroom demo; a real
        # deployment could swap this for Redis or a database.
        self._short_term: Dict[str, List[Turn]] = {}
        os.makedirs(self.base_dir, exist_ok=True)

    def new_session_id(self) -> str:
        """Return a fresh session id (first 10 hex chars of a UUID4, for readability)."""
        return uuid.uuid4().hex[:10]

    def append_turn(self, session_id: str, role: Role, content: str) -> None:
        """Append one turn to the session's short-term history.

        The history is a sliding window: once it exceeds
        ``short_term_max_turns`` the oldest turns are dropped.

        Raises:
            ValueError: If ``session_id`` or ``content`` is empty after trimming.
        """
        sid = _safe_text(session_id)
        if not sid:
            raise ValueError("session_id is empty")
        c = _safe_text(content)
        if not c:
            raise ValueError("content is empty")

        turns = self._short_term.setdefault(sid, [])
        turns.append(Turn(role=role, content=c, ts_ms=_now_ms()))
        # Count the surplus explicitly instead of slicing with a negative index:
        # ``turns[-0:]`` would be the whole list, so a limit of 0 never truncated.
        overflow = len(turns) - self.short_term_max_turns
        if overflow > 0:
            del turns[:overflow]

    def get_recent_turns(self, session_id: str) -> List[Turn]:
        """Return a copy of the session's recent turns (empty list if unknown)."""
        sid = _safe_text(session_id)
        # Copy so callers cannot mutate the internal list by accident.
        return list(self._short_term.get(sid, []))

    def _task_state_path(self, session_id: str) -> str:
        """Return the JSON file path that stores ``task_state`` for this session.

        Raises:
            ValueError: If ``session_id`` is empty or contains path separators /
                NUL, which could otherwise make the file land outside ``base_dir``.
        """
        sid = _safe_text(session_id)
        if not sid:
            raise ValueError("session_id is empty")
        # session_id may come from request headers or login state, so it must
        # not be able to steer the path (e.g. "../../etc/passwd").
        if any(ch in sid for ch in ("/", "\\", "\x00")):
            raise ValueError("session_id contains illegal path characters")
        return os.path.join(self.base_dir, f"task_state_{sid}.json")

    def load_task_state(self, session_id: str) -> Dict[str, Any]:
        """Load the persisted ``task_state``; return ``{}`` when none was saved yet.

        Raises:
            ValueError: If the file exists but is not valid JSON
                (``json.JSONDecodeError``) or does not hold a JSON object.
        """
        p = self._task_state_path(session_id)
        # Open directly instead of checking existence first, so a file removed
        # between the check and the open cannot raise.
        try:
            with open(p, "r", encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            return {}
        if not isinstance(data, dict):
            raise ValueError(f"task_state file does not contain a JSON object: {p}")
        return data

    def save_task_state(self, session_id: str, state: Dict[str, Any]) -> None:
        """Persist ``state`` for the session, replacing any previous file.

        Raises:
            TypeError: If ``state`` is not a dict or holds values JSON cannot encode.
        """
        if not isinstance(state, dict):
            raise TypeError("state must be a dict")
        p = self._task_state_path(session_id)
        tmp = p + ".tmp"
        # Write to a temp file first and swap it in with os.replace, so an
        # interrupted write never leaves a half-written state file behind.
        try:
            with open(tmp, "w", encoding="utf-8") as f:
                json.dump(state, f, ensure_ascii=False, indent=2)
            os.replace(tmp, p)
        except Exception:
            # Do not leave a partial temp file around when serialization fails.
            try:
                os.remove(tmp)
            except OSError:
                pass
            raise

    def upsert_task_state(self, session_id: str, patch: Dict[str, Any]) -> Dict[str, Any]:
        """Merge ``patch`` into the stored state, save it, and return the new state."""
        state = self.load_task_state(session_id)
        state.update(patch)
        self.save_task_state(session_id, state)
        return state

def demo() -> Tuple[str, List[Turn], Dict[str, Any]]:
    """Write a short dialogue and a task state, then read both back.

    Returns:
        The generated session id, the recent turns, and the persisted task state.
    """
    store = MemoryStore(base_dir="./mem_data", short_term_max_turns=6)
    sid = store.new_session_id()

    # 1) Short-term memory: the dialogue history.
    script = (
        ("system", "你是分拣车间智能助手,回答必须给出可复验证据。"),
        ("user", "把苹果放到A箱"),
        ("assistant", "已记录:苹果→A箱(trace_id=demo1)"),
        ("user", "上一个指令再来一次"),
    )
    for speaker, message in script:
        store.append_turn(sid, speaker, message)

    # 2) Long-term memory: the task state.
    store.upsert_task_state(sid, {"last_item": "苹果", "last_bin": "A", "last_action": "put"})
    return sid, store.get_recent_turns(sid), store.load_task_state(sid)

if __name__ == "__main__":
    # Run with: py -3 .\memory_store.py
    session_id, recent_turns, task_state = demo()
    print("session_id:", session_id)
    print("recent_turns:", [asdict(turn) for turn in recent_turns])
    print("task_state:", task_state)

逐段解释与自检要点:

3)运行自测

py -3 .\memory_store.py

解释与自检要点:

目标:对“这个苹果/它/上一个指令”等指代做工程级的最小落地:规则优先,规则无法处理时再回退到 LLM(可选)。

1)实现 coref_resolve.py(可直接复制运行)

# coref_resolve.py
# 目标:把多轮对话里的“指代词”解析成可执行的结构化槽位(slots)
# - “上一个指令/再来一次” → repeat_of + resolved_text 回放
# - “这个/它/刚才那个” → ref_item(旧实体)+ item_name(当前实体)
import re
import uuid
from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple

@dataclass(frozen=True)
class ResolveResult:
    """Outcome of resolving the references in one user utterance."""

    # parse_trace_id: trace id of this parse (to locate "understood it wrong" problems)
    parse_trace_id: str
    # resolved_text: the utterance with references restored to executable text where possible
    resolved_text: str
    # slots: structured slots for tool calls / routing
    slots: Dict[str, Any]


# "Replay the previous instruction" phrases.
_REPEAT_RE = re.compile(r"(上一个指令|上一条指令|刚才那个指令|再来一次|重复一次)")
# Item pointers; "其它" (= "other") is excluded so it is not read as the pronoun "它".
_POINTER_RE = re.compile(r"(这个|(?<!其)它|刚才那个)")
# Item names the demo knows about.
_ITEM_RE = re.compile(r"(苹果|香蕉|电池|玻璃)")
# Bin letter directly before "箱"; lowercase is accepted and normalized to uppercase.
_BIN_RE = re.compile(r"([A-Za-z])\s*箱")
_QUERY_RULE_MARKERS = ("怎么分拣", "分拣规则", "如何分拣", "怎么处理")


def _state_text(state: Dict[str, Any], key: str) -> str:
    """Read a state field as trimmed text; a missing key or ``None`` yields ``""``."""
    value = state.get(key)
    return "" if value is None else str(value).strip()


def resolve_sorting_references(text: str, task_state: Dict[str, Any]) -> ResolveResult:
    """Resolve references in ``text`` against the previous turn's ``task_state``.

    Args:
        text: The current user utterance.
        task_state: State left by earlier turns; the keys read here are
            ``last_item``, ``last_action`` and ``last_raw_user``.

    Returns:
        A ``ResolveResult`` whose ``slots`` may contain ``repeat_of``,
        ``ref_item``, ``item_name``, ``bin``, ``action``, and — when a reference
        cannot be bound — ``need_clarification`` / ``clarification_question``.
    """
    parse_trace_id = uuid.uuid4().hex[:8]
    t = (text or "").strip()
    state = task_state or {}

    # None-safe reads: a JSON null must not turn into the literal item "None".
    last_item = _state_text(state, "last_item")
    last_action = _state_text(state, "last_action")
    last_raw_user = _state_text(state, "last_raw_user")

    resolved = t
    slots: Dict[str, Any] = {}

    # 1) Instruction replay: "previous instruction / once more".
    if _REPEAT_RE.search(t):
        if last_raw_user:
            slots["repeat_of"] = last_raw_user
            resolved = last_raw_user
        else:
            slots["need_clarification"] = True
            slots["clarification_question"] = "我没有找到“上一个指令”的记录,请你补充要重复的具体操作。"

    # 2) Item pointers, two situations:
    #    - "把它放到B箱": no explicit item, bind the pointer to last_item
    #    - "这个不要了,换成香蕉": pointer = old item, explicit name = new item
    # Repeat phrases are removed before looking for pointers; otherwise
    # "刚才那个指令" would also count as the item pointer "刚才那个".
    pointer_scope = _REPEAT_RE.sub("", t)
    has_pointer = bool(_POINTER_RE.search(pointer_scope))
    explicit_item = _ITEM_RE.search(t)
    if has_pointer:
        if last_item:
            # ref_item is the old entity being referred to (needed by replace/cancel).
            slots["ref_item"] = last_item
            if not explicit_item:
                # Pointer only, no explicit item: substituting is safe. A callable
                # keeps last_item literal (no backslash/group interpretation).
                resolved = _POINTER_RE.sub(lambda _m: last_item, resolved)
                slots.setdefault("item_name", last_item)
        else:
            slots["need_clarification"] = True
            slots["clarification_question"] = "你说的“这个/它/刚才那个”指的是哪个物品?请补充物品名称。"

    # 3) Lightweight slot extraction: bin letter and action keyword.
    m_bin = _BIN_RE.search(resolved)
    if m_bin:
        slots["bin"] = m_bin.group(1).upper()
    # Priority matters:
    # - "不要了,换成X" is a replace, not a cancel
    # - "怎么分拣/分拣规则" is query_rule, so last_action is not reused by mistake
    if any(marker in resolved for marker in _QUERY_RULE_MARKERS):
        slots["action"] = "query_rule"
    elif "换成" in resolved:
        slots["action"] = "replace"
    elif "放" in resolved:
        slots["action"] = "put"
    elif "不要" in resolved or "取消" in resolved:
        slots["action"] = "cancel"

    # 4) An explicit item name wins (for "换成香蕉" the new item takes priority).
    m_item = _ITEM_RE.search(resolved)
    if m_item:
        slots["item_name"] = m_item.group(1)

    # 5) Fall back to the previous action only when this turn named none.
    if last_action and "action" not in slots:
        slots["action"] = last_action

    return ResolveResult(parse_trace_id=parse_trace_id, resolved_text=resolved, slots=slots)

def demo():
    """Resolve a handful of sample utterances against a fixed state and print each result."""
    # Stands in for the long-term task state left behind by the previous turn.
    prior_state = {"last_item": "苹果", "last_action": "put", "last_raw_user": "把苹果放到A箱"}
    utterances = (
        "这个不要了,换成香蕉",
        "上一个指令再来一次",
        "把它放到B箱",
        "刚才那个放到C箱",
        "取消上一个操作",
        "这个苹果怎么分拣?",
    )
    for utterance in utterances:
        result = resolve_sorting_references(utterance, prior_state)
        print(result.parse_trace_id, "|", utterance, "=>", result.resolved_text, "| slots=", result.slots)

if __name__ == "__main__":
    # Run with: py -3 .\coref_resolve.py
    demo()

逐段解释与自检要点:

2)运行自测

py -3 .\coref_resolve.py

解释与自检要点:

1)把 samples 扩展为 ≥10 条,并覆盖至少 3 种指代表达(“这个/它/上一个指令/刚才那个/再来一次”)。
提示:把“箱号缺失”“物品缺失”“同时出现新旧物品”都做出来。

2)在 resolve_sorting_references 里增加一个最小规则:当用户说“上一个指令再来一次”,同时又补充“换成香蕉”时,你应当输出“重复上一个动作,但物品替换为香蕉”的 slots。
提示:把“回放文本”和“本轮补丁(patch)”合并,避免覆盖掉新信息。

每次收到用户输入 text
  1) session_id:从启动参数/HTTP header/用户登录态里拿(课堂可先固定为 "demo")
  2) 读取 task_state(长期)与 recent_turns(短期)
  3) 做指代消解:text -> resolved_text + slots(生成 parse_trace_id)
  4) 把 resolved_text + recent_turns 喂给 Agent
  5) 把本轮用户/助手消息写入 short_term
  6) 把 slots 中关键字段写入 long_term(如 last_item/last_action/last_raw_user)

解释与自检要点:

  1. 对话历史存储:区分短期(最近 N 轮)与长期(任务状态/偏好)。
  2. 上下文关联:每轮把“本轮解析结果(slots)”写入任务状态,用于下一轮承接。
  3. 回复生成:把 resolved_text + 关键状态 + 最近对话 组织成模型输入,生成可执行/可复验的回复。
  4. 评估与优化:用标注数据计算准确率,找到失败模式(误绑/漏绑/过度替换/缺失追问不当),再迭代规则或引入 LLM 回退。

1)实现 dialogue_manager_demo.py(可直接复制运行)

# dialogue_manager_demo.py
# 目标:把“记忆读写 + 指代消解 + 回复生成”串成一个可运行的多轮对话闭环
import uuid
from dataclasses import asdict
from typing import Any, Dict, List, Optional

from memory_store import MemoryStore
from coref_resolve import resolve_sorting_references

def build_reply(resolved_text: str, slots: Dict[str, Any]) -> str:
    """Produce a rule-based reply for one resolved utterance.

    No real LLM is involved, so the demo runs without an API key; in a project
    integration this is the place to return the result of ``Agent.invoke(...)``.

    Args:
        resolved_text: The utterance after reference resolution.
        slots: Structured slots; the keys read here are ``need_clarification``,
            ``clarification_question``, ``action``, ``item_name`` and ``bin``.

    Returns:
        The reply text, always ending with the fixed ``trace_id=local_demo`` marker.
    """
    if slots.get("need_clarification"):
        # A key slot is missing: ask back instead of guessing and acting wrongly.
        question = slots.get("clarification_question", "请补充关键信息")
        return f"需要澄清:{question}(trace_id=local_demo)"

    action = slots.get("action") or "unknown"
    item = slots.get("item_name") or "UNKNOWN_ITEM"
    target_bin = slots.get("bin")

    if action == "query_rule":
        # A real system would call a rule-lookup / knowledge-base tool here.
        reply = f"已理解:查询{item}的分拣规则(trace_id=local_demo)"
    elif action == "put" and target_bin:
        reply = f"已理解:把{item}放到{target_bin}箱(trace_id=local_demo)"
    elif action == "replace":
        reply = f"已理解:替换物品为{item},将沿用上一条动作/箱位(trace_id=local_demo)"
    elif action == "cancel":
        reply = "已理解:取消上一条操作(trace_id=local_demo)"
    else:
        reply = f"已收到:{resolved_text}(trace_id=local_demo)"
    return reply

class DialogueManager:
    """Runs one dialogue turn: read memory, resolve references, reply, write memory."""

    def __init__(self, mem: MemoryStore) -> None:
        # mem handles both the short-term dialogue history and the long-term task_state.
        self.mem = mem

    def handle(self, session_id: str, user_text: str) -> Dict[str, Any]:
        """Process one user utterance and return a structured result.

        Args:
            session_id: Session whose memory is read and updated.
            user_text: The raw user utterance of this turn.

        Returns:
            A dict with ``ok``, ``trace_id``, ``parse_trace_id``,
            ``resolved_text``, ``slots``, ``assistant_text``, ``recent_turns``
            and the updated ``task_state``.
        """
        # trace_id follows the dialogue flow (to locate execution/generation
        # errors); parse_trace_id follows the parse itself.
        trace_id = uuid.uuid4().hex[:8]

        # 1) Read the long-term task state (input to reference resolution).
        task_state = self.mem.load_task_state(session_id)
        # 2) Resolve "它/这个/上一个指令" into structured slots.
        resolve = resolve_sorting_references(user_text, task_state)
        slots = resolve.slots

        # 3) Build the reply (rule-based here; replaceable by Agent.invoke(...)).
        assistant_text = build_reply(resolve.resolved_text, slots)

        # 4) Write the short-term history (most recent N turns).
        self.mem.append_turn(session_id, "user", user_text)
        self.mem.append_turn(session_id, "assistant", assistant_text)

        # 5) Write back reusable fields so the next turn can pick them up.
        patch = {
            "last_action": slots.get("action", task_state.get("last_action", "")),
        }
        if not slots.get("need_clarification"):
            # Store the resolved text, not the raw utterance: saving "再来一次"
            # or "把它放到B箱" verbatim would make a later replay resolve to a
            # repeat command or to a dangling pronoun. An utterance that still
            # needs clarification is not replayable, so the old value is kept.
            patch["last_raw_user"] = resolve.resolved_text
        if slots.get("item_name"):
            patch["last_item"] = slots["item_name"]
        if slots.get("bin"):
            patch["last_bin"] = slots["bin"]
        # upsert_task_state returns the merged state, so no second read is needed.
        new_state = self.mem.upsert_task_state(session_id, patch)

        # 6) Structured result: convenient for acceptance checks and statistics.
        return {
            "ok": True,
            "trace_id": trace_id,
            "parse_trace_id": resolve.parse_trace_id,
            "resolved_text": resolve.resolved_text,
            "slots": slots,
            "assistant_text": assistant_text,
            "recent_turns": [asdict(t) for t in self.mem.get_recent_turns(session_id)],
            "task_state": new_state,
        }

def demo():
    """Feed a fixed four-turn script through the dialogue manager and print each reply."""
    manager = DialogueManager(MemoryStore(base_dir="./mem_data", short_term_max_turns=8))
    # A fixed session id makes the multi-turn carry-over reproducible in class.
    session = "demo"
    script = (
        "把苹果放到A箱",
        "上一个指令再来一次",
        "这个不要了,换成香蕉",
        "把它放到B箱",
    )
    for utterance in script:
        result = manager.handle(session, utterance)
        print(result["trace_id"], result["parse_trace_id"], "|", result["assistant_text"])

if __name__ == "__main__":
    # Run with: py -3 .\dialogue_manager_demo.py
    demo()

逐段解释与自检要点:

2)运行自测

py -3 .\dialogue_manager_demo.py

解释与自检要点:

目标:用“有标签的数据”衡量指代消解效果,而不是靠感觉调参。

1)定义标注数据格式(JSONL,一行一条样本)

{"id":"c1","type":"repeat","task_state":{"last_raw_user":"把苹果放到A箱","last_item":"苹果","last_action":"put","last_bin":"A"},"input":"上一个指令再来一次","expected":{"resolved_text":"把苹果放到A箱","slots":{"repeat_of":"把苹果放到A箱","action":"put","item_name":"苹果","bin":"A"}}}
{"id":"c2","type":"pronoun","task_state":{"last_raw_user":"把苹果放到A箱","last_item":"苹果","last_action":"put"},"input":"把它放到B箱","expected":{"slots":{"ref_item":"苹果","item_name":"苹果","bin":"B","action":"put"}}}
{"id":"c3","type":"replace","task_state":{"last_raw_user":"把苹果放到A箱","last_item":"苹果","last_action":"put"},"input":"这个不要了,换成香蕉","expected":{"slots":{"ref_item":"苹果","action":"replace","item_name":"香蕉"}}}
{"id":"c4","type":"need_clarify","task_state":{},"input":"把它放到A箱","expected":{"slots":{"need_clarification":true}}}
{"id":"c5","type":"query_rule","task_state":{"last_raw_user":"把电池放到C箱","last_item":"电池","last_action":"put"},"input":"刚才那个怎么分拣?","expected":{"slots":{"ref_item":"电池","item_name":"电池","action":"query_rule"}}}
{"id":"c6","type":"query_rule","task_state":{"last_raw_user":"把苹果放到A箱","last_item":"苹果","last_action":"put"},"input":"这个苹果怎么分拣?","expected":{"slots":{"ref_item":"苹果","item_name":"苹果","action":"query_rule"}}}

解释与自检要点:

2)实现评估脚本 evaluate_coref.py

# evaluate_coref.py
# 目标:用标注数据对“指代消解”做可量化评估(不要靠感觉调)
import json
from typing import Any, Dict, List, Tuple

from coref_resolve import resolve_sorting_references

def _as_dict(x: Any) -> Dict[str, Any]:
    """Return ``x`` unchanged when it is a dict, otherwise an empty dict.

    Keeps the evaluation script from crashing on malformed sample fields.
    """
    if isinstance(x, dict):
        return x
    return {}

def _match_slots(pred: Dict[str, Any], exp: Dict[str, Any], keys: List[str]) -> Tuple[int, int, List[str]]:
    """Compare predicted slots with labeled slots, field by field.

    Only keys that appear in ``exp`` count towards the total, so an unlabeled
    field is never scored as an error.

    Returns:
        ``(hit, total, miss_keys)``: number of matching fields, number of
        labeled fields checked, and the labeled keys whose values differ.
    """
    labeled = [k for k in keys if k in exp]
    miss_keys = [k for k in labeled if not (pred.get(k) == exp.get(k))]
    total = len(labeled)
    return total - len(miss_keys), total, miss_keys

def main(path: str = "./coref_labeled.jsonl") -> None:
    """Evaluate ``resolve_sorting_references`` on a labeled JSONL file and print metrics.

    Each non-blank line is one JSON object with ``id``, ``type``, ``task_state``,
    ``input`` and ``expected`` (``slots`` plus an optional ``resolved_text``).

    Printed metrics:
        - slot_accuracy: per-field hit rate over the labeled slots (lenient)
        - case_accuracy: a sample counts only if all its labeled slots match (strict)
        - resolved_text_accuracy: only over samples that label ``resolved_text``
        - failure modes, a per-type breakdown, and up to 8 failing samples

    Args:
        path: Location of the labeled JSONL file.

    Raises:
        ValueError: If a line is not valid JSON or is not a JSON object; the
            message names the file and line number.
    """
    # Stricter setting: ref_item (binding of the old entity) is evaluated as well.
    keys_to_check = ["ref_item", "item_name", "bin", "action", "need_clarification"]

    slot_hit_sum = 0
    slot_total_sum = 0
    case_hit_sum = 0
    case_total_sum = 0
    resolved_text_hit_sum = 0
    resolved_text_total_sum = 0

    # by_type aggregates counters per sample type to expose weak spots.
    by_type: Dict[str, Dict[str, int]] = {}
    bad_cases: List[Dict[str, Any]] = []
    fail_modes: Dict[str, int] = {}

    def _bump(t: str, k: str, v: int) -> None:
        d = by_type.setdefault(t, {})
        d[k] = d.get(k, 0) + v

    def _classify_failure(miss_keys: List[str], pred: Dict[str, Any], exp: Dict[str, Any]) -> str:
        # Minimal attribution used to cluster failing samples by failure mode.
        if exp.get("need_clarification") is True and pred.get("need_clarification") is not True:
            return "should_clarify"
        for key, mode in (
            ("action", "action_mismatch"),
            ("item_name", "item_mismatch"),
            ("ref_item", "ref_item_mismatch"),
            ("bin", "bin_mismatch"),
        ):
            if key in miss_keys:
                return mode
        return "other"

    with open(path, "r", encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                continue
            # Name the offending line so a broken sample is easy to find.
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as err:
                raise ValueError(f"{path}:{line_no}: invalid JSON ({err})") from err
            if not isinstance(obj, dict):
                raise ValueError(f"{path}:{line_no}: each sample must be a JSON object")

            sample_type = str(obj.get("type", "unknown"))
            task_state = _as_dict(obj.get("task_state"))
            input_text = str(obj.get("input", ""))
            expected_obj = _as_dict(obj.get("expected"))
            expected_slots = _as_dict(expected_obj.get("slots"))
            expected_resolved_text = expected_obj.get("resolved_text")

            # 1) Run the function under test.
            r = resolve_sorting_references(input_text, task_state)
            pred = _as_dict(r.slots)

            # 2) slot_accuracy (per field).
            hit, total, miss_keys = _match_slots(pred, expected_slots, keys_to_check)
            slot_hit_sum += hit
            slot_total_sum += total
            _bump(sample_type, "slot_hit", hit)
            _bump(sample_type, "slot_total", total)

            # 3) case_accuracy (per sample).
            case_total_sum += 1
            _bump(sample_type, "case_total", 1)
            if not miss_keys:
                case_hit_sum += 1
                _bump(sample_type, "case_hit", 1)

            # 4) Optional: resolved_text_accuracy.
            if expected_resolved_text is not None:
                resolved_text_total_sum += 1
                _bump(sample_type, "resolved_text_total", 1)
                if str(r.resolved_text) == str(expected_resolved_text):
                    resolved_text_hit_sum += 1
                    _bump(sample_type, "resolved_text_hit", 1)

            # 5) Collect failing samples for targeted rule changes / fallbacks.
            if miss_keys:
                mode = _classify_failure(miss_keys, pred, expected_slots)
                fail_modes[mode] = fail_modes.get(mode, 0) + 1
                bad_cases.append(
                    {
                        "id": obj.get("id"),
                        "type": sample_type,
                        "input": input_text,
                        "miss_keys": miss_keys,
                        "fail_mode": mode,
                        "expected": expected_slots,
                        "pred": pred,
                        "resolved_text": r.resolved_text,
                        "expected_resolved_text": expected_resolved_text,
                        "parse_trace_id": r.parse_trace_id,
                    }
                )

    # Uniform metric output.
    slot_acc = (slot_hit_sum / slot_total_sum) if slot_total_sum else 0.0
    case_acc = (case_hit_sum / case_total_sum) if case_total_sum else 0.0
    resolved_text_acc = (resolved_text_hit_sum / resolved_text_total_sum) if resolved_text_total_sum else 0.0

    print(f"slot_accuracy={slot_acc:.3f} ({slot_hit_sum}/{slot_total_sum})")
    print(f"case_accuracy={case_acc:.3f} ({case_hit_sum}/{case_total_sum})")
    if resolved_text_total_sum:
        print(f"resolved_text_accuracy={resolved_text_acc:.3f} ({resolved_text_hit_sum}/{resolved_text_total_sum})")
    print(f"bad_cases={len(bad_cases)}")
    # Most frequent failure mode first, ties broken alphabetically.
    print("fail_modes:", dict(sorted(fail_modes.items(), key=lambda x: (-x[1], x[0]))))
    print("by_type:")
    for t in sorted(by_type):
        d = by_type[t]
        t_slot_total = d.get("slot_total", 0)
        t_case_total = d.get("case_total", 0)
        t_slot_acc = (d.get("slot_hit", 0) / t_slot_total) if t_slot_total else 0.0
        t_case_acc = (d.get("case_hit", 0) / t_case_total) if t_case_total else 0.0
        print(f"- {t}: slot_acc={t_slot_acc:.3f} case_acc={t_case_acc:.3f} n={t_case_total}")
    for c in bad_cases[:8]:
        print("----")
        print("id:", c["id"])
        print("type:", c["type"])
        print("fail_mode:", c["fail_mode"])
        print("input:", c["input"])
        print("miss_keys:", c["miss_keys"])
        print("expected:", c["expected"])
        print("pred:", c["pred"])
        print("resolved_text:", c["resolved_text"])
        if c.get("expected_resolved_text") is not None:
            print("expected_resolved_text:", c["expected_resolved_text"])
        print("parse_trace_id:", c["parse_trace_id"])

if __name__ == "__main__":
    # Run with: py -3 .\evaluate_coref.py
    main()

逐段解释与自检要点:

3)运行评估

py -3 .\evaluate_coref.py

本节课 AI 的正确打开方式(工程闭环,不靠玄学):

1)让 AI 生成“骨架”,你负责“约束、审计、复验”。
2)把失败样例喂给 AI,让它提出“可落地的最小改动”,然后你用 evaluate_coref.py 回归验证。
3)所有 AI 输出必须满足:可运行、可解释、可复验(至少能跑评估脚本提升指标或减少失败样例)。

1)给 AI 的指令模板(可直接复制)

你是资深 Python 工程师。请基于以下函数与失败样例,改进指代消解逻辑:
1) 只允许修改 resolve_sorting_references(text, task_state)
2) 不要引入第三方库,只用标准库
3) 必须保持:缺失信息时追问澄清(need_clarification=true)
4) 不允许把用户明确说出的新物品覆盖掉
5) 给出你修改后的完整函数代码,并说明:每条失败样例为什么会失败、你改动如何修复、可能带来的新风险

函数当前版本(粘贴你的代码):
...

失败样例(从 evaluate_coref.py 输出中粘贴 5 条):
...

解释与自检要点:


作业布置:

1)提交记忆模块集成后的项目代码、配置文件截图。
要求:截图能证明短期/长期记忆的配置项与落盘位置;代码中能找到写入/读取关键逻辑。

2)提交多轮对话功能测试记录(含 5 组以上对话示例),记录指代消解准确率。
要求:每组对话至少 3 轮;每轮给出输入、解析结果(含 parse_trace_id/关键 slots)、输出;附总准确率(例如 slot_accuracy=0.83)。

3)提交 AI 交互记录(生成记忆管理代码、优化上下文逻辑的过程),附 100 字左右优化说明。
要求:包含你的约束指令、AI 输出、你的审计结论(保留/拒绝与理由)、回归验证结果(评估脚本输出或对比表)。